; Vector3: X right, Y down, Z forward.
; On the FP stack and in memory it looks like {Y X Z} (Y is used the most).
; (u|v) is the dot product: ux*vx + uy*vy + uz*vz.

org 100h ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h
  dw 12        ;=0C 00  or al,0

  mov al,13h
  dec di
  dec di
DI_ equ -4     ; pixel_adr@di = -4
P:shr cl,1
  add cl,[si]  ; B@cl = 12..20..43,43..12

  int 10h      ; video mode / set palette index: bx=i dh=R ch=G cl=B

  mov al,bl
  cbw
  xor al,ah    ; 0..127,127..0
  mov ch,al
  shr ch,1     ; G@ch = 0..63,63..0
  mov dh,ch    ; R@dh = 0..63,63..0
  mul al       ; ah = 0..16..63,63..16..0
  
  or bl,bl
BIG equ $-1 ;31195
  jns S
  xchg ax,dx   ; R@dh = 0..63,63..16..0
S:mov cl,ah    ; B@cl = 0..16..63,63..0

  mov ax,1010h
  inc bx
  jnz P        ;bx=0 cx=0

;Each frame: Generate gem normals to p0..p12=[bp+200h,300h,...].
M:pusha  ; adr:   -18 -16 -14 -12 -10  -8  -6  -4  -2
         ; stack:  di  si  bp  sp  bx  dx  cx  ax   0
         ; data:   -4 100 9??  -2  0  9fca T  key
  mov cx,[si]
G:add bp,si    ; i@cx = 12...1; bp points to p[12-i]; carry=0
  pusha

;Dodecahedron planes with unit normals.
  fld1         ; exact would be atan((1+sqrt5)/2)=1.0172rad
  fsincos
  fldz               ;|0 0.540 0.841 (exact: 0.526 0.851)
N:test cl,1          ;|a b c
  jnz K
  fchs
K:fstp st3           ;|b c +-a
  loop N  ;cl=0 cf=0 ;|y x z

;Do a bunch of rotations. It doesn't need to be fast.
; y x z --> cx-sz cz+sx y
  mov cl,24
R:fstp st3           ;|x z y

R2:
  fld st1            ;|z x z y                     ;|x sz x cz y
  fild word[-6+di-DI_]
  fidiv word[-16+di-DI_]  ;|t=T/256
  fsincos            ;|c=cos(t) s=sin(t) z x z y   ;|c s x sz x cz y
  fmulp st4          ;|s z x cz y                  ;|s x sz cx cz y
  fmulp              ;|sz x cz y                   ;|sx sz cx cz y
  cmc
  jc R2
  faddp st3          ;|sz cx cz+sx y
  fsubp              ;|new.y=cx-sz .x=cz+sx .z=y
  loop R

;  mov cl,24
;R:fld st2            ;|z y x z                     ;|x sz y x cz
;  fild word[-6+di-DI_]
;BIG: ;=30174
;  fidiv word[-16+di-DI_]  ;|t=T/256
;  fsincos            ;|c=cos(t) s=sin(t) z y x z   ;|c s x sz y x cz
;  fmulp st5          ;|s z y x cz                  ;|s x sz y cx cz
;  fmulp              ;|sz y x cz                   ;|sx sz y cx cz
;  cmc
;  jc R
;  faddp st4          ;|sz y cx cz+sx
;  fsubp st2          ;|y cx-sz cz+sx
;  fstp st3           ;|new.y=cx-sz .x=cz+sx .z=y
;  loop R

  fstp dword[bp+si]
  fstp dword[bp+si+4]
  fstp dword[bp+si+8]

  popa
  loop G
  popa

  mov dx,0xA000-10-20-20-4
  mov es,dx    ; dx:bx=YX:XX = 0x9fca:0

; the visible pixels are A0000..AF9FF, I want X=0 Y=0 in the center
;Each pixel: cx=T dx:bx=YX:XX(init=9fca:0) di=adr(init=-4)
X:inc dx       ; part of "dx:bx += 0x0000CCCD"
X2:
  stosb

  pusha        ; adr:     -18 -16 -14 -12 -10  -8  -6  -4  -2
  fninit       ; stack:    di  si  bp  sp  bx  dx  cx  ax   0
  mov bx,es    ; s16:  pixadr 100 9??  -2  ..X..Y  T result
  mov di,-4 ;di = address of pushed ax

;Compute ray direction.
  fild word [byte BIG+si-100h]  ; store 30K as a double, read as two floats
  fst qword[bx+di]  ; t_front@float[bx+di] = 0, t_back@float[bx] = 6.93
  fild word[di+4-9]
  fild word[di+4-8]  ;|y=Y x=X z=BIG

;Intersect the gem.
  call GEM
  popa         ; color -> pushed ax
  mov al,ah

;  mov al,dl    ; test - show only palette

;; Faster, but lower quality: draw each pixel twice.
;  stosb
;  add bx,0xCCCD; dx:bx = YXX += 0000CCCD
;  adc dx,0

  add bx,0xCCCD; dx:bx = YXX += 0000CCCD
  jnc X2
  jnz X        ; do 65536 pixels

  in al,60h
  dec al
  loopnz M        ; T--
;  ret          ; fallthrough

GEM:
;; Faster: intersect the gem only in the center of the screen
;  add dh,dh
;  jo B
;  add dl,dl
;  jo B
  
;Hit the gem.
  mov cx,[si]  ; i@cx = 12...1
;Ray-plane intersection.
;Find the front plane with maximum t and back plane with minimum t.
; tf@[bx+di],    tb@[bx]      ray parameter t
; pf@[bx+di+si], pb@[bx+si]   pointer to plane
I:add bp,si    ; bp points to p[i]
  fldlg2             ;|pd=0.301 y x z
  fadd dword[bp+si+8];|N=pd-(ro|p[i]) y x z  ; ro = 0 0 -1

  add si,[si]  ; Dot product:
D:add si,di    ; 108 104 100
  fld dword[bp+si]   ;|p[i].z ...
  fmul st4           ;|rd.z*p[i].z ...
  jpo D              ;|(rd*p[i]).y .x .z N rd.y .x .z
  faddp
  faddp              ;|D=(rd|p[i]) N y x z
  
;If we hit the plane from the front (D<0), update tf. Otherwise update tb.
  push bx
  fst dword[bp+di]; -> p[i].dot_rd
  test [bp+di+2],sp ; sf=1 if we're in front of the plane
  jns FRONT
  add bx,di    ; bx = address of tf?tb
FRONT:         ; D<0:  if tf*D < N { tf=N/D; pf=current; }  maximalize tf
  fld st0      ; D>=0: if tb*D < N { tb=N/D; pb=current; }  minimalize tb
  fmul dword[bx]     ;|(tf?tb)*D D N y x z

;  fcomp st2          ;|D N y x z    ;compatible version
;  fnstsw ax
;  sahf         ; cf = (tf?tb)*D < N
  fcomip st2

  jc NEXT
  fdivr st1          ;|t=N/D N y x z
  fst dword[bx] ; -> tf?tb
  mov [bx+si],bp ; pf?pb = current
NEXT:
  fcompp
  pop bx             ;|y x z

  mov eax,[bx+di]
  cmp eax,[bx]  ; if tf>tb { no_hit: early exit }
  jg B                ;|y x z

  loop I

;Reflect from the gem: reflect(i,n) = i - 2*n*(i|n)
  mov bx,[bx+si-4] ; pf
  add si,[si]
Y:add si,di   ;108 104 100
  fld dword[bx+di]   ;|(rd|pf) rd.y .x .z  ; reads pf->dot_rd
  fmul dword[bx+si]  ;|(rd|pf)*pf.z rd.y .x .z
  fadd st0           ;|2*(rd|pf)*pf.z rd.y .x .z
  fsubr st3          ;|R.z=rd.z-2*(rd|pf)*pf.z rd.y .x .z
  jpo Y              ;|(R=i-2*n(i|n)).y R.x R.z rd.y .x .z

;Environment map: chessboard below, sky gradient above.
B:fist word[di]      ;|y x z
  test [di],sp  ; if y>=-0.5 { chessboard } else { sky }
  js E          ; the sky is just y (= y^2 after gamma)
  
  fidivr word[si]    ;|12/y x z
  fmul st1,st0
  fmul st2           ;|u=z*12/y v=x*12/y z

  fistp word[bp+di]
  mov al,[bp+di]
  sub al,[di+4-6]
  fistp word[bp+di]
  xor al,[bp+di]  ; xortex@ax = (u-T) XOR v
  and al,9
  add al,5        ; tex = (xortex AND 0b1001) + 5  [5|6|13|14]
  shl ax,12
  mul word[di]
  mov [di],dx     ; pushed ax = tex*y/16
  
E:ret
